library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
gardRelativeAbundance <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/gard_mapping/clade_assignments/gardRelativeAbundance.tsv", delim = "\t") %>%
mutate(SampleID=as.character(SampleID),
Clade=as.character(Clade))
## Parsed with column specification:
## cols(
## SampleID = col_double(),
## Clade = col_double(),
## n = col_double(),
## propClade = col_double()
## )
alignmentsFinal <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/gard_mapping/clade_assignments/aglingmentsFinal.tsv", delim = "\t")
## Parsed with column specification:
## cols(
## query = col_character(),
## pcntid = col_double(),
## SampleID = col_double(),
## SubjectID = col_double(),
## GWcoll = col_double(),
## RNAlater = col_character(),
## PaperCohort = col_double(),
## Annotation = col_character(),
## Clade = col_double()
## )
sgDF <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/shotgunMetadata.tsv", delim = "\t")%>%
mutate(SampleID=as.character(SampleID))
## Parsed with column specification:
## cols(
## .default = col_double(),
## SampleIndex = col_character(),
## SampleType = col_character(),
## TermPre = col_character(),
## Operator = col_character(),
## PctTrim = col_character(),
## RNAlater = col_character(),
## Cohort = col_character()
## )
## See spec(...) for full column specifications.
#metaphlan data
#load("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/methods_comparisons/PS_RData/metaphlan_PS.RData")
mDF0 <- read_tsv("/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/mergedMetaphlanAbundance.tsv") %>%
filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
select(Species, everything(), -ID) %>%
as.data.frame()
## Parsed with column specification:
## cols(
## .default = col_double(),
## ID = col_character()
## )
## See spec(...) for full column specifications.
#make col and row names
samps <- colnames(mDF0[2:ncol(mDF0)]) %>%
str_extract("[0-9]{10}")
samps
## [1] "1000801248" "1000801318" "1000801368" "1001301158" "1001301248"
## [6] "1001301318" "1001801138" "1001801248" "1001801318" "1002701138"
## [11] "1002701158" "1002701278" "1003101118" "1003101188" "1003101278"
## [16] "1004001168" "1004001278" "1004001348" "1005601098" "1005601168"
## [21] "1005601348" "1050801308" "1050801318" "1050835178" "1050835218"
## [26] "1060435158" "1060435318" "1060435348" "1060801198" "1060801338"
## [31] "1060835148" "1061235118" "1061235188" "1061235278" "1061635088"
## [36] "1061635208" "1061635368" "1062101108" "1062101238" "1062101338"
## [41] "1062301128" "1062301198" "1062335178" "1062601228" "1062601258"
## [46] "1062601368" "1062701128" "1062701298" "1062735148" "1062735198"
## [51] "1062901138" "1062901218" "1062901318" "1900401198" "1900401288"
## [56] "1900401398" "1902401118" "1902401208" "1902401308" "2050501168"
## [61] "2050501298" "2050501348" "4002101198" "4002101298" "4002135018"
## [66] "4003235018" "4003235288" "4003235368" "4004235208" "4004235248"
## [71] "4004235268" "4004735328" "4004735368" "4004935158" "4004935268"
## [76] "4004935338" "4005935278" "4005935328" "4005935358" "4006635198"
## [81] "4006635258" "4006635278" "4006635338" "4006935208" "4006935248"
## [86] "4006935358" "4007135108" "4007135118" "4007135148" "4007135288"
## [91] "4007235168" "4007235278" "4007435168" "4007435248" "4007435258"
## [96] "4007535198" "4007535238" "4007535358" "4008434348" "4008435158"
## [101] "4008435358" "4009035168" "4009035268" "4009035368" "4009835178"
## [106] "4009835228" "4009835268"
colnames(mDF0) <- c("Species", samps)
rownames(mDF0) <- mDF0$Species
#Final edits to table
mDF <- mDF0 %>%
select(-Species) %>%
t() %>%
as.data.frame() %>%
mutate(SampleID=samps) %>%
select(SampleID, everything())
mDF[1:6,1:5]
## SampleID Actinobaculum_massiliense Actinobaculum_schaalii
## 1 1000801248 0 0
## 2 1000801318 0 0
## 3 1000801368 0 0
## 4 1001301158 0 0
## 5 1001301248 0 0
## 6 1001301318 0 0
## Actinobaculum_urinale Actinomyces_europaeus
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
Recalculate proportions of Gardnerella as proportions of the whole microbiome. (Rethink how this should be done… reads/total reads?) Currently done by multiplying times proportion of all Gardnerella found by Metaphlan
SampleIDlist <- rep(unique(sgDF$SampleID), 6) %>%
.[order(.)]
cladeFrame <- tibble(SampleID=SampleIDlist,
Clade=rep(c("1","2", "3", "4", "5", "6"), 107),
n=0,
propClade=0)
gardRelativeAbundance0 <- gardRelativeAbundance %>%
full_join(cladeFrame) %>%
group_by(SampleID, Clade) %>%
filter(n==max(n)) %>%
ungroup() %>%
left_join(unique.data.frame(sgDF[,c("SampleID", "SubjectID", "TermPre", "GWdell", "Cohort", "GWcoll")])) %>%
mutate(SubjectID=as.character(SubjectID))
## Joining, by = c("SampleID", "Clade", "n", "propClade")
## Joining, by = "SampleID"
Get Gardnerella relative abundances from metaphlan
# metaphlanGard <- PS_metaphlan_all@otu_table %>%
# t() %>%
# as.data.frame() %>%
# mutate(SampleID=rownames(.)) %>%
# select(Gardnerella_vaginalis, SampleID)
gardRelativeAbundance1 <- gardRelativeAbundance0 %>%
left_join(mDF[,c("SampleID", "Gardnerella_vaginalis")], by="SampleID") %>%
mutate(propCladeCommunity=propClade*Gardnerella_vaginalis)
Observe total proportion of other Gardnerella clades in the community vs. proportion of each clade
ggplot(gardRelativeAbundance1, aes(x=propCladeCommunity, y=Gardnerella_vaginalis-propCladeCommunity, color=Clade)) +
geom_point() +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73")) +
labs(x="Proportion of Gardnerella clade in community", y="Proportion of other Gardnerella clades in the community", color="Clade") +
facet_grid(Cohort~Clade) +
theme(axis.text.x = element_text(angle=45))
Not the best plot for visualizing the data, but it does show that when the total there is one clade of Garnerella in the community and the community is nearly 100% Gardnerella, that clade is clade 1.
Observe proportion of clades vs other clades
# reformat dataframe for this
cladexclade <- gardRelativeAbundance1 %>%
select(-propClade, -n) %>%
spread(Clade, propCladeCommunity) %>%
mutate(x1=`1`,
x2=`2`,
x3=`3`,
x4=`4`,
x5=`5`,
x6=`6`) %>%
gather("Clade1","propClade1", c(`1`,`2`,`3`,`4`,`5`,`6`)) %>%
gather("Clade2", "propClade2", c(x1, x2, x3, x4, x5, x6))%>%
mutate(Clade2=case_when(Clade2=="x1"~"1",
Clade2=="x2"~"2",
Clade2=="x3"~"3",
Clade2=="x4"~"4",
Clade2=="x5"~"5",
Clade2=="x6"~"6")) %>%
filter(Clade1!=Clade2)
#Stanford
stanford <- cladexclade %>%
filter(Cohort=="Stanford")
ggplot(stanford, aes(x=propClade1, y=propClade2))+
geom_point() +
facet_grid(Clade1~Clade2) +
coord_fixed() +
labs(title="Stanford", x="Proportion clade x", y="Proportion clade y") +
theme(axis.text.x = element_text(angle=45))
#UAB
stanford <- cladexclade %>%
filter(Cohort=="UAB")
ggplot(stanford, aes(x=propClade1, y=propClade2))+
geom_point() +
facet_grid(Clade1~Clade2) +
coord_fixed() +
labs(title="UAB", x="Proportion clade x", y="Proportion clade y") +
theme(axis.text.x = element_text(angle=45))
No clear patterns.
Observe proportion of each clade in each sample vs. term/preterm delivery
ggplot(data=gardRelativeAbundance1, aes(x=TermPre, y=propCladeCommunity, color=TermPre)) +
geom_jitter() +
geom_boxplot(alpha=0) +
scale_y_log10() +
scale_x_discrete(labels=c("Preterm", "Term")) +
scale_color_manual(values=c("#56B4E9", "#D55E00")) +
facet_grid(Cohort~Clade) +
labs(x=element_blank(), y="Proportion clade in community") +
theme(legend.position = "none")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 293 rows containing non-finite values (stat_boxplot).
Proportion of community vs preterm/term births faceted by clade. Does not suggest differences in proportion of each clade in each sample and term vs. preterm births. Perhaps the time series information is important here…
look at two specific samples
ggplot(gardRelativeAbundance1[gardRelativeAbundance1$SubjectID=="10031",], aes(x=GWcoll, y=propCladeCommunity, color=Clade, fill=Clade))+
geom_bar(stat = "identity")+
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73")) +
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73")) +
labs(title = "Subject 10031", subtitle = "Preterm")
ggplot(gardRelativeAbundance1[gardRelativeAbundance1$SubjectID=="10621",], aes(x=GWcoll, y=propCladeCommunity, color=Clade, fill=Clade))+
geom_bar(stat = "identity") +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73")) +
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73")) +
labs(title="Subject 10621", subtitle = "Term", x="Gestation weeks at collection", y="Proportion of community")
Not going to draw any major conclusions from these two subjects (small n), but I just wanted to take a closer look since they have the highest proportion of Gardnerella and specifically clade 1.
Gestational frequency vs PTB
sgDF <- sgDF %>%
mutate(SubjectID=as.character(SubjectID))
gardRelativeAbundance1 %>%
mutate(pres=case_when(propCladeCommunity>0~1,
propCladeCommunity==0~0)) %>%
group_by(SubjectID, Clade) %>%
summarise(freq=mean(pres)) %>%
left_join(sgDF[,c("SubjectID", "Cohort", "TermPre")], by="SubjectID") %>% # add delivery time info
ggplot(., aes(x=TermPre, y=freq, color=TermPre))+
geom_jitter() +
geom_boxplot(alpha=0) +
scale_x_discrete(labels=c("Preterm", "Term")) +
scale_color_manual(values=c("#56B4E9", "#D55E00")) +
facet_grid(Cohort~Clade) +
theme(legend.position = "none") +
labs(x="Preterm vs. Term")
Prevalence of each clade vs ptb (proportion samples in each group with/without clade)
gardRelativeAbundance1 %>%
mutate(pres=case_when(propCladeCommunity>0~1,
propCladeCommunity==0~0)) %>%
group_by(Cohort, TermPre, Clade) %>%
summarise(Prevalence=mean(pres)) %>%
ggplot(., aes(x=TermPre, y=Prevalence, fill = TermPre)) +
geom_col() +
scale_x_discrete(labels=c("Preterm", "Term")) +
scale_fill_manual(values=c("#56B4E9", "#D55E00")) +
facet_grid(Cohort~Clade) +
labs(x="Preterm vs. Term")
Clade presence and overall Gardnerella relative abundance
ggplot(gardRelativeAbundance1, aes(x=(propCladeCommunity == 0), y=Gardnerella_vaginalis, color=(propCladeCommunity>0))) +
geom_jitter() +
geom_boxplot(alpha=0) +
scale_color_manual(values=c("#56B4E9", "#D55E00"), labels=c("Present", "Absent")) +
facet_grid(Cohort~Clade) +
labs(x="Clade presence", y="Total proportion of Gardnerella")+
scale_x_discrete(labels=c("Present", "Absent")) +
theme(legend.position = "none")
Will need to test, but it seems that presence of clade 1 may predict relative abundance of Gardnerella.
Should test for odds ratio of clade 1 presence and PTB risk. (The selection of samples may hinder our ability to do this. Should we focus more on clade presence and overall relative abundance of Gardnerella).
Remove 0 values and replace with NA so they don’t show up on the barplot
gardRelativeAbundance2 <- gardRelativeAbundance1 %>%
mutate(propCladeCommunity=na_if(propCladeCommunity, 0))
Plot
ggplot(data=gardRelativeAbundance2, mapping=aes(x=reorder(SampleID, -propCladeCommunity, FUN=sum, na.rm=TRUE), y=propCladeCommunity, fill=Clade, color=Clade)) +
geom_bar(stat="identity") +
labs(x="Sample", y="Proportion of community", color="Legend", fill="Legend") +
geom_point(aes(y=-.03, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
geom_point(aes(y=-.04, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
geom_point(aes(y=-.05, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999"), labels=c("Clade 1", "Clade2", "Clade 3", "Clade 4", "Clade 5", "Clade 6", "Preterm", "Term")) +
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999"), labels=c("Clade 1", "Clade2", "Clade 3", "Clade 4", "Clade 5", "Clade 6", "Preterm", "Term")) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank())
## Warning: Removed 293 rows containing missing values (position_stack).
Looks like we should test for a relationship between clade 1/2 presence and overall Gardnerella abundance.
ggplot(data=gardRelativeAbundance2, mapping=aes(x=reorder(reorder(SampleID, -propCladeCommunity, FUN=sum, na.rm=TRUE), TermPre=="T", na.rm=TRUE), y=propCladeCommunity, fill=Clade, color=Clade)) +
geom_bar(stat="identity") +
labs(x="Sample", y="Proportion of community", color="Legend", fill="Legend") +
geom_point(aes(y=-.03, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
geom_point(aes(y=-.04, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
geom_point(aes(y=-.05, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999"), labels=c("Clade 1", "Clade2", "Clade 3", "Clade 4", "Clade 5", "Clade 6", "Preterm", "Term")) +
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999"), labels=c("Clade 1", "Clade2", "Clade 3", "Clade 4", "Clade 5", "Clade 6", "Preterm", "Term")) +
facet_wrap(~Cohort) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank())
## Warning: Removed 293 rows containing missing values (position_stack).
A relationship with PTB might be more complicated. Consider working in methods that will include the time-series aspect of the data or subject average.
ggplot(data=gardRelativeAbundance2, mapping=aes(x=reorder(SampleID, TermPre=="T"), y=propCladeCommunity, fill=Clade, color=Clade)) +
geom_bar(stat="identity") +
labs(x="Sample", y="Proportion of community", color="Legend", fill="Legend") +
geom_point(aes(y=-.03, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
geom_point(aes(y=-.04, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
geom_point(aes(y=-.05, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999"), labels=c("Clade 1", "Clade2", "Clade 3", "Clade 4", "Clade 5", "Clade 6", "Preterm", "Term")) +
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999"), labels=c("Clade 1", "Clade2", "Clade 3", "Clade 4", "Clade 5", "Clade 6", "Preterm", "Term")) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank()) +
geom_line(aes(x=reorder(as.numeric(SubjectID), TermPre=="T", na.rm=TRUE), y=0))
## Warning: Removed 293 rows containing missing values (position_stack).
Separate bars by subject.
Caclulate average proportion of each clade per subject
cladeAverages <- gardRelativeAbundance1 %>%
data.frame(stringsAsFactors=FALSE) %>%
group_by(Clade, SubjectID) %>%
mutate(cladeAvg=mean(propCladeCommunity)) %>%
ungroup() %>%
select(SubjectID, Clade, cladeAvg, TermPre, Cohort, GWdell) %>%
unique.data.frame()
Bar plot of average clade proportion per subject.
cladeAverages %>%
mutate(cladeAvg=na_if(cladeAvg, 0)) %>%
ggplot(mapping=aes(x=reorder(reorder(SubjectID, -cladeAvg, FUN=sum, na.rm=TRUE), TermPre=="T", na.rm=TRUE), y=cladeAvg, fill=Clade, color=Clade)) +
geom_bar(stat = "identity") +
labs(x="Subject", y="Proportion", color="Legend", fill="Legend") +
geom_point(aes(y=-.03, color=TermPre, fill=TermPre), shape="square", size=5.5, show.legend = FALSE) +
geom_point(aes(y=-.04, color=TermPre, fill=TermPre), shape="square", size=5.5, show.legend = FALSE) +
geom_point(aes(y=-.05, color=TermPre, fill=TermPre), shape="square", size=5.5, show.legend = FALSE) +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999"), labels=c("Clade 1", "Clade2", "Clade 3", "Clade 4", "Clade 5", "Clade 6", "Preterm", "Term")) +
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999"), labels=c("Clade 1", "Clade2", "Clade 3", "Clade 4", "Clade 5", "Clade 6", "Preterm", "Term")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90),
axis.ticks.x = element_blank(),
axis.line.x = element_blank())
## Warning: Removed 75 rows containing missing values (position_stack).
Average proportion of each clade in the entire community vs. preterm/term:
ggplot(data=cladeAverages, aes(x=TermPre, y=cladeAvg, color=TermPre)) +
geom_jitter() +
geom_boxplot(alpha=0)+
scale_x_discrete(labels=c("Preterm", "Term")) +
scale_y_log10() +
scale_color_manual(values=c("#D55E00", "#56B4E9")) +
facet_grid(Cohort~Clade) +
labs(x=element_blank(), y="Proportion of community") +
theme(legend.position = "none")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 75 rows containing non-finite values (stat_boxplot).
May not be any differences in PTB risk by proportion of each clade in the entire community. Perhaps presence is a better predictor than proportion…
If the clade was present at any time in the subject
ggplot(cladeAverages, aes(x=(cladeAvg==0), fill=TermPre)) +
geom_bar(position="dodge") +
scale_fill_manual(values=c("#D55E00", "#56B4E9"), labels=c("Preterm", "Term")) +
facet_grid(Cohort~Clade) +
labs(x="Clade presence", fill="Delivery") +
scale_x_discrete(labels=c("Present", "Absent"))
Could clade 1 or 2 presence increase risk for PTB?
It appears that all subjects in the UAB cohort had Gardnerella clades 1, 2, and 4.
Look at the dominant Gardnerella clade in each sample (of those that have Gardnerella)
gardRelativeAbundance2 <- gardRelativeAbundance1 %>%
group_by(SampleID) %>%
filter(propClade==max(propClade)) %>%
ungroup()
ggplot(gardRelativeAbundance2) +
geom_point(aes(x=floor(GWcoll), y=reorder(SubjectID, GWdell), color=Clade)) +
geom_point(aes(y=SubjectID, x=GWdell+.5), shape=4, color="black") +
#geom_point(aes(x=0, y=SubjectID, color=Cohort)) +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000", "#999999")) +
geom_vline(xintercept = 37.5) +
labs(x="Gestation weeks at collection", y="Subject ID", title="Most abundant Gardnerella clade")
X marks gestation weeks at delivery. Vertical line is 37 weeks.
I think we need to test for clade 1 and/or 2 presence (especially clade 1) and total proportion of Gardnerella in the sample and also PTB risk. Without doing any statistics yet, it seems that presence of clade 1 predicts total proportion of Gardnerella. My initial hunch is that when you have this “BV-like” microbiome, abundance in Gardnerella, it is likely that these “pathogenic” strains are present, but you also may have these “non-pathogenic” strains in the community as well, and they may be more abundant than the “pathogenic” strains. The genomics information will be very interesting. Metatranscriptomic data would also be very interesting if it is true that these strains can reshape the community even if they are not the dominant strain of Gardnerella in the community!!
LGabundance <- gardRelativeAbundance1 %>%
left_join(mDF[,c("SampleID", "Lactobacillus_crispatus", "Lactobacillus_iners", "Lactobacillus_jensenii", "Lactobacillus_gasseri")], by="SampleID") %>%
gather("Lactobacillus", "LAbundance", c(Lactobacillus_crispatus, Lactobacillus_iners, Lactobacillus_jensenii, Lactobacillus_gasseri)) %>%
mutate(LAbundance=LAbundance/100,
Lactobacillus=case_when(Lactobacillus=="Lactobacillus_crispatus"~"Lactobacillus crispatus",
Lactobacillus=="Lactobacillus_gasseri"~"Lactobacillus gasseri",
Lactobacillus=="Lactobacillus_iners"~"Lactobacillus iners",
Lactobacillus=="Lactobacillus_jensenii"~"Lactobacillus jensenii"))
head(LGabundance)
## # A tibble: 6 x 13
## SampleID Clade n propClade SubjectID TermPre GWdell Cohort GWcoll
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1000801… 3 6 0.111 10008 T 41 Stanf… 25
## 2 1000801… 4 48 0.889 10008 T 41 Stanf… 25
## 3 1000801… 3 3 0.15 10008 T 41 Stanf… 32
## 4 1000801… 4 17 0.85 10008 T 41 Stanf… 32
## 5 1000801… 3 5 0.104 10008 T 41 Stanf… 37
## 6 1000801… 4 43 0.896 10008 T 41 Stanf… 37
## # … with 4 more variables: Gardnerella_vaginalis <dbl>,
## # propCladeCommunity <dbl>, Lactobacillus <chr>, LAbundance <dbl>
LGabundance %>%
filter(Cohort=="Stanford") %>%
ggplot(aes(x=propCladeCommunity, y=LAbundance, color=Lactobacillus)) +
geom_point() +
facet_grid(Lactobacillus~Clade) +
#coord_fixed() +
labs(title="Stanford", x="Proportion Gardnerella clade", y="Proportion Lactobacillus")
#ggsave("/Users/hlberman/Desktop/test1.pdf", height = 8, width=15)
LGabundance %>%
filter(Cohort=="UAB") %>%
ggplot(aes(x=propCladeCommunity, y=LAbundance, color=Lactobacillus)) +
geom_point() +
facet_grid(Lactobacillus~Clade) +
#coord_fixed() +
labs(title="UAB", x="Proportion Gardnerella clade", y="Proportion Lactobacillus")
#ggsave("/Users/hlberman/Desktop/test2.pdf", height = 8, width=15)
Mainly only see L. iners in UAB samples, as we have seen previously. Clear inverse relationship between most clades and Lactobacillus species, except clade 4 and L. gasseri in Stanford, and clades 1 and 3 and L. iners in UAB cohort.
LGabundance %>%
filter(Cohort=="Stanford") %>%
ggplot(aes(x=(propCladeCommunity == 0), y=LAbundance, color=(propCladeCommunity==0))) +
geom_jitter() +
geom_boxplot(alpha=0) +
scale_color_manual(values=c("#56B4E9", "#D55E00"), labels=c("Present", "Absent")) +
facet_grid(Lactobacillus~Clade) +
labs(title="Stanford", x="Clade presence", y="Proportion of Lactobacillus")+
scale_x_discrete(labels=c("Present", "Absent")) +
#scale_y_log10() +
theme(legend.position = "none")
LGabundance %>%
filter(Cohort=="UAB") %>%
ggplot(aes(x=(propCladeCommunity == 0), y=LAbundance, color=(propCladeCommunity==0))) +
geom_jitter() +
geom_boxplot(alpha=0) +
scale_color_manual(values=c("#56B4E9", "#D55E00"), labels=c("Present", "Absent")) +
facet_grid(Lactobacillus~Clade) +
labs(title="UAB", x="Clade presence", y="Proportion of Lactobacillus")+
scale_x_discrete(labels=c("Present", "Absent")) +
#scale_y_log10() +
theme(legend.position = "none")
Difficult to make any conclusions in the UAB cohort, because most samples do not have species other than L. iners. In the Stanford cohort, L. crispatus relative abundance may be greater without any Gardnerella clade. May not be a difference between L. iners relative abundance with and without Gardnerlla clades.
LGabundance0 <- LGabundance %>%
select(-n,-propClade) %>%
spread(Clade, propCladeCommunity) %>%
spread(Lactobacillus, LAbundance) %>%
gather("Taxon", "Abundance", `1`, `2`, `3`, `4`, `5`, `6`, `Lactobacillus crispatus`, `Lactobacillus gasseri`, `Lactobacillus iners`, `Lactobacillus jensenii`) %>%
filter(!is.na(Abundance))
ggplot(LGabundance0, aes(x=reorder(reorder(SampleID, -Abundance, FUN=sum, na.rm=TRUE), TermPre=="T", na.rm=TRUE), , y=Abundance, fill=Taxon, color=Taxon)) +
geom_bar(stat="identity") +
labs(x="Sample", y="Proportion of community", color="Legend", fill="Legend") +
geom_point(aes(y=-.03, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
geom_point(aes(y=-.04, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
geom_point(aes(y=-.05, color=TermPre, fill=TermPre), shape="square", show.legend = FALSE) +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "darkgreen", "#F0E442", "darkblue", "darkred", "#000000", "#999999"), labels=c("Gardnerella clade 1", "Gardnerella clade2", "Gardnerella clade 3", "Gardnerella clade 4", "Gardnerella clade 5", "Gardnerella clade 6", "Lactobacillus crispatus", "Lactobacillus gasseri", "Lactobacillus iners", "Lactobacillus jensenii", "Preterm", "Term")) +
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "darkgreen", "#F0E442", "darkblue", "darkred", "#000000", "#999999"), labels=c("Gardnerella clade 1", "Gardnerella clade2", "Gardnerella clade 3", "Gardnerella clade 4", "Gardnerella clade 5", "Gardnerella clade 6", "Lactobacillus crispatus", "Lactobacillus gasseri", "Lactobacillus iners", "Lactobacillus jensenii", "Preterm", "Term")) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank())
# geom_line(aes(x=reorder(as.numeric(SubjectID), TermPre=="T", na.rm=TRUE), y=0))
Caclulate average proportion of each clade per subject
taxAverages <- LGabundance0 %>%
data.frame(stringsAsFactors=FALSE) %>%
group_by(Taxon, SubjectID) %>%
mutate(taxAvg=mean(Abundance)) %>%
ungroup() %>%
select(SubjectID, Taxon, taxAvg, TermPre, Cohort, GWdell) %>%
unique.data.frame()
Bar plot of average clade proportion per subject.
taxAverages %>%
mutate(taxAvg=na_if(taxAvg, 0)) %>%
ggplot(mapping=aes(x=reorder(reorder(SubjectID, -taxAvg, FUN=sum, na.rm=TRUE), TermPre=="T", na.rm=TRUE), y=taxAvg, fill=Taxon, color=Taxon)) +
geom_bar(stat = "identity") +
labs(x="Subject", y="Proportion", color="Legend", fill="Legend") +
geom_point(aes(y=-.03, color=TermPre, fill=TermPre), shape="square", size=5.5, show.legend = FALSE) +
geom_point(aes(y=-.04, color=TermPre, fill=TermPre), shape="square", size=5.5, show.legend = FALSE) +
geom_point(aes(y=-.05, color=TermPre, fill=TermPre), shape="square", size=5.5, show.legend = FALSE) +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "darkgreen", "#F0E442", "darkblue", "darkred", "#000000", "#999999"), labels=c("Gardnerella clade 1", "Gardnerella clade2", "Gardnerella clade 3", "Gardnerella clade 4", "Gardnerella clade 5", "Gardnerella clade 6", "Lactobacillus crispatus", "Lactobacillus gasseri", "Lactobacillus iners", "Lactobacillus jensenii", "Preterm", "Term")) +
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "darkgreen", "#F0E442", "darkblue", "darkred", "#000000", "#999999"), labels=c("Gardnerella clade 1", "Gardnerella clade2", "Gardnerella clade 3", "Gardnerella clade 4", "Gardnerella clade 5", "Gardnerella clade 6", "Lactobacillus crispatus", "Lactobacillus gasseri", "Lactobacillus iners", "Lactobacillus jensenii", "Preterm", "Term")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90),
axis.ticks.x = element_blank(),
axis.line.x = element_blank())
## Warning: Removed 164 rows containing missing values (position_stack).
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.0.1 purrr_0.3.2
## [5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.1 ggplot2_3.1.1
## [9] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.6 reshape2_1.4.3 haven_2.1.0
## [5] lattice_0.20-38 colorspace_1.4-1 generics_0.0.2 htmltools_0.3.6
## [9] yaml_2.2.0 utf8_1.1.4 rlang_0.3.4 pillar_1.3.1
## [13] glue_1.3.1 withr_2.1.2 modelr_0.1.4 readxl_1.3.1
## [17] plyr_1.8.4 munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
## [21] rvest_0.3.3 evaluate_0.13 labeling_0.3 knitr_1.22
## [25] fansi_0.4.0 broom_0.5.2 Rcpp_1.0.1 scales_1.0.0
## [29] backports_1.1.4 jsonlite_1.6 hms_0.4.2 digest_0.6.18
## [33] stringi_1.4.3 grid_3.5.2 cli_1.1.0 tools_3.5.2
## [37] magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.2
## [41] xml2_1.2.0 lubridate_1.7.4 assertthat_0.2.1 rmarkdown_1.12
## [45] httr_1.4.0 rstudioapi_0.10 R6_2.4.0 nlme_3.1-139
## [49] compiler_3.5.2